In cytolab/RAPID: Risk Assessment Population Identification

Risk Assessment Population IDentification (RAPID) Workflow

knitr::opts_chunk$set(echo = TRUE)

# uncomment lines below to install packages
# install.packages("survminer", repos = "http://cran.us.r-project.org")
# install.packages("Rtsne", repos = "http://cran.us.r-project.org")
# install.packages("devtools", repos = "http://cran.us.r-project.org")
# if (!require("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
# BiocManager::install("cytoMEM")
# install.packages("tidyverse", repos = "http://cran.us.r-project.org")
# if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
# BiocManager::install("flowCore")
# BiocManager::install("FlowSOM")
# BiocManager::install("Biobase")
# install.packages("ggpubr", repos = "http://cran.us.r-project.org")

# uncomment line below to install RAPID
# devtools::install_github("cytolab/RAPID")

# load packages into the working library
library(devtools)
library(cytoMEM)
library(RAPID)
library(tidyverse)
library(ggpubr)
library(flowCore)
library(Biobase)
library(FlowSOM)
library(survival)
library(survminer)
library(plyr)
library(Rtsne)
library(cowplot)
library(RColorBrewer)

# set working directory 
setwd(paste(getwd(),"/data_files/gbm_dataset/", sep = ""))

# set output file name tag 
output_filename = "_RAPID"

# read data into R
data.set <-  dir(pattern="*.fcs")
data <- lapply(lapply(data.set,read.FCS),exprs)
combined.patient.data = as.data.frame(do.call(rbind, mapply(cbind, data, "FILE_ID"= c(1:length(data)), SIMPLIFY=F)))
names <- colnames(combined.patient.data) 
colnames(combined.patient.data) =c((read.FCS(data.set[[1]])@parameters@data[["desc"]]),"FILE_ID")

# to see patient order, uncomment and run line below
# data.set

# create varible for survival or clinical data
OS.data.set <-  dir(pattern="*.csv")
OS.data = read.csv(OS.data.set)

# selecting t-SNE columns from data set
tsne.data <- combined.patient.data %>%
  select(ends_with('_2'))

# transform other data 
transformed.data <- combined.patient.data %>%
  select(-contains('FILE_ID')) %>%
  mutate_all(function(x) asinh(x/5))

# choose markers to use for variance calculation and transform
chosen.markers <- transformed.data %>%
  select(contains('(v2)'))

# create variable for markers to use in MEM
MEMdata = cbind(chosen.markers,transformed.data$`CycinB1-139`)

setwd(paste(getwd(),"/data_files/gbm_dataset/", sep = ""))
dir.create("./output files/")


range <- apply(apply(tsne.data, 2, range), 2, diff)
graphical.ratio <- (range[1] / range[2])

# UMAP flat dot plot and density dot plot (1 dot = 1 cell)
tsne.plot <- data.frame(x = tsne.data[, 2], y = tsne.data[, 1])

# density dot plot
ggplot(tsne.plot, aes(x = x, y = y)) + 
  coord_fixed(ratio = graphical.ratio)+ geom_point(size = 0.5) +geom_density_2d_filled(bins = 29) +scale_fill_manual(values = c("NA","NA","NA","NA","NA","NA","NA",viridis::viridis(24,option = "A"))) +
labs(x = "t-SNE 1", y = "t-SNE 2", 
  title = "Density on t-SNE Axes") + theme_bw()  + 
  theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank())+theme(legend.position = "none") +
  labs(caption = "Data from Sinnaeve et al., eLife 2020")

tsne.by.marker<-as_tibble(tsne.data) %>%
  bind_cols(MEMdata)  %>%
  gather(channel, intensity, -tSNE1_2, -tSNE2_2) %>%
  mutate(across(channel,factor))%>%
  group_split(channel) %>%
  map(
    ~ggplot(.,aes(x= tSNE2_2, y= tSNE1_2, col = intensity)) +
  geom_point(size = 2) +
  scale_color_gradientn(
    colours = colorRampPalette(rev(brewer.pal(n = 11, name = "Spectral")))(5))+
  facet_grid(~ channel, labeller = function(x) label_value(x, multi_line = FALSE)) +
  coord_fixed() +
  theme_bw()+
  theme(strip.text.x = element_text(size = 20),legend.title=element_blank()))%>%
  plot_grid(plotlist = ., align = 'hv', ncol = 8)

png(paste("./output files/",strftime(Sys.time(),"%Y-%m-%d_%H%M%S"),"t-SNE on transformed data.png"),height = 2000,width = 4000)
print(tsne.by.marker)
dev.off()

# find ideal numbers of clusters and create cluster ID data

# Here we find 44 clusters, so below is the code to find those 44 clusters
mat.input <- as.matrix(tsne.data)
metadata <- data.frame(name = dimnames(mat.input)[[2]], desc = dimnames(mat.input)[[2]])
metadata$range <- apply(apply(mat.input, 2, range), 2, diff)
metadata$minRange <- apply(mat.input, 2, min)
metadata$maxRange <- apply(mat.input, 2, max)
input.flowframe <- new("flowFrame", exprs=mat.input,parameters = AnnotatedDataFrame(metadata))
fSOM <- FlowSOM(input.flowframe, scale = TRUE, colsToUse = c(1:2), nClus    = 44, seed = 38)
optimized.cluster.data <- as.vector(as.numeric(GetMetaclusters(fSOM)))

# To run the optimization, uncomment and run the line below
# optimized.cluster.data = optimize_FlowSOM_clusters(tsne.data,chosen.markers, N = 50, seed = 38)

# plot optimized FlowSOM clusters
FlowSOM_clusters_plot <- plot_clusters(x = tsne.data[,2],y = tsne.data[,1], clusters = as.factor(optimized.cluster.data),xlab ="t-SNE 2",ylab = "t-SNE 1",legendtitle = "FlowSOM Cluster",title = "Cancer Cells t-SNE with FlowSOM Clusters")
FlowSOM_clusters_plot

# cluster ID in 1st column and patient ID in 2nd column
cluster.patient.data = as.data.frame(cbind(optimized.cluster.data,combined.patient.data$FILE_ID))

# calculate survival statistics
survival_stats <- subset_survival_analysis(cluster.patient.data, OS.data, clinical_col = 3, status_col = 4)

# plot significant survival curves
survival_plots <- plot_survival_curves(survival_stats, cluster.patient.data, OS.data, clinical_col = 3, status_col = 4)
survival_plots

# find significant subsets and assign prognostic value (1 = none, 2 = positive prog., 3 = negative prog.)
prognostic_subsets = significant_clusters(survival_stats, cluster_data = optimized.cluster.data)

# plot significant subsets
RAPID_plot <- plot_sig_clusters(x = tsne.data[,2], y = tsne.data[,1], clusters = optimized.cluster.data, survival_stats,xlab ="t-SNE 2",ylab = "t-SNE 1",legendtitle = "Prognostic Subsets",title = "Cancer Cells t-SNE with Prognostic Populations")
RAPID_plot

setwd(paste(getwd(),"/data_files/gbm_dataset/", sep = ""))

cluster = optimized.cluster.data
MEM.data = cbind(MEMdata,cluster)

# use MEM to find marker enrichments for FlowSOM clusters

# identity proteins and signaling proteins
MEM.values.p = MEM(MEM.data, transform = FALSE, cofactor = 0, choose.markers = FALSE, markers = "1:4,8,11:16,18:21,24",choose.ref = FALSE, zero.ref = FALSE, rename.markers = FALSE, new.marker.names = "TUJ1,CD117,S100B,CD34,NCAM,CD49F,CD133,PDGFRa,SOX2,CD15,EGFR,CD171,Nestin,CD44,GFAP,HLA-DR",file.is.clust = FALSE, add.fileID = FALSE, IQR.thresh = NULL)

MEM.values.s = MEM(MEM.data, transform = FALSE, cofactor = 0, choose.markers = FALSE, markers = "5:7,9:10,17,22:23,25",choose.ref = FALSE, zero.ref = FALSE, rename.markers = FALSE, new.marker.names = "p-STAT5,p-AKT,p-STAT1,p-p38,p-STAT3,p-NFkB,p-ERK,p-S6,CyclinB1",file.is.clust = FALSE, add.fileID = FALSE, IQR.thresh = NULL)

# build MEM heatmap and output enrichment scores
build_heatmaps(MEM.values.p, cluster.MEM = "both", cluster.medians = "none", 
display.thresh = 1, output.files = TRUE, labels = FALSE, only.MEMheatmap = FALSE)

build_heatmaps(MEM.values.s, cluster.MEM = "both", cluster.medians = "none", 
display.thresh = 0, output.files = TRUE, labels = FALSE, only.MEMheatmap = FALSE)

setwd(paste(getwd(),"/data_files/gbm_dataset/", sep = ""))

# create output files folder by uncommenting and running line below 
dir.create(file.path(getwd(), "output files"), showWarnings = FALSE)

# export figures to PDF
ggexport(FlowSOM_clusters_plot,RAPID_plot,survival_plots,filename = paste("./output files/",strftime(Sys.time(),"%Y-%m-%d_%H%M%S"),output_filename,".pdf",sep=""), width = 7.2, height = 5.4)

combined.patient.data = as.data.frame(do.call(rbind, mapply(cbind, data, "FILE_ID"= c(1:length(data)), SIMPLIFY=F)))
data.to.fcs = cbind(combined.patient.data,prognostic_subsets)
name<-c(names[1:(ncol(combined.patient.data)-1)],"Cluster","Prog_Status")

# export 
separate.fcs.files = split(data.to.fcs,data.to.fcs$`FILE_ID`)
for (i in 1:length(separate.fcs.files)){
reduce.data = subset(separate.fcs.files[[i]], select=-c(`FILE_ID`))
mat.input<- as.matrix(reduce.data)
metadata <- data.frame(name = dimnames(mat.input)[[2]], desc = name)
metadata$range <- apply(apply(mat.input, 2, range), 2, diff)
metadata$minRange <- apply(mat.input, 2, min)
metadata$maxRange <- apply(mat.input, 2, max)
input.flowframe <- new("flowFrame", exprs=mat.input,parameters = AnnotatedDataFrame(metadata))  
newname  = str_remove(data.set[i], ".fcs")
new.filename = paste0("./output files/",strftime(Sys.time(),"%Y-%m-%d_%H%M%S"),"_",newname,"_RAPID.fcs",sep="")
write.FCS(input.flowframe,filename = new.filename)
print(paste("FCS file ",i," done", sep = ""))}

# print session information
sessionInfo()